# Load packages: haven (read Stata .dta), MCMCpack / mvtnorm (Bayesian samplers
# and multivariate-normal draws used by mcmc.m5 / mcmc.m6 further below).
library(haven)
library(MCMCpack)
library(mvtnorm)
# clean workspace
# NOTE(review): rm(list=ls()) and setwd() inside a script are discouraged —
# they clobber the user's session state and tie the script to RStudio.
# Consider an RStudio project / here::here() instead.
rm(list=ls())
# set working directory in RStudio to directory where currently active script
# lives (requires running interactively inside RStudio; fails under Rscript)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# import file: JOBS II field experiment data (job-search intervention)
df_jobs <- read_dta('JOBSII_HR.dta')
df_jobs
Only 54% of individuals who were assigned to the intervention actually received the treatment. In the literature, this problem is known as noncompliance. Here we ignore noncompliance issues focusing on assessing causal effects of the assignment to the treatment (Intention-to-treat analysis).
So, I drop the W variable and focus on the assignment variable Z for assessing the causal effects of the treatment, which ensures that the analysis aligns with the intention-to-treat principle.
# Remove the compliance indicator W: the ITT analysis uses only assignment Z
df_jobs <- df_jobs[, setdiff(names(df_jobs), "W")]
# df_jobs
### Extra
table(df_jobs$Z)
0 1
130 268
prop.table(table(df_jobs$Z))
0 1
0.3266332 0.6733668
# table(df_jobs$Z, df_jobs$motivation)
# summary(df_jobs$motivation)
# unique(df_jobs$motivation)
# Outcome variables: marginal distributions at month 6
outcome_vars <- c('depress6', 'employ6')
par(mfrow=c(1, 2))
for (var in outcome_vars) {
hist(df_jobs[[var]], freq=FALSE, main=paste(var,"| post treatment"), xlab=NULL, col="#FC4E07")}
# Depression pre (depress0) vs post (depress6), split by assignment arm:
# top row = control, bottom row = treatment (2 x 2 grid)
outcome_vars <- c('depress0', 'depress6')
par(mfrow=c(2, 2))
for (var in outcome_vars) {hist(df_jobs[[var]][df_jobs$Z == 0], freq=FALSE, main=paste(var, "Control", sep=" | "), xlab=NULL, col="#FC4E07")}
for (var in outcome_vars) {hist(df_jobs[[var]][df_jobs$Z == 1], freq=FALSE, main=paste(var, "Treat", sep=" | "), xlab=NULL, col="#00AFBB")}
# Depression pre vs post, pooled over both arms
outcome_vars <- c('depress0', 'depress6')
par(mfrow=c(1, 2))
for (var in outcome_vars) {
hist(df_jobs[[var]], freq=FALSE, main=var, xlab=NULL, col="#FC4E07")}
# Binary variables: one row per covariate — control, treatment, pooled
bin_vars <- c('sex', 'race', 'nonmarried')
par(mfrow=c(length(bin_vars), 3))
for (var in bin_vars) {
hist(df_jobs[[var]][df_jobs$Z == 0], xlab=NULL, main=paste(var, "Control", sep=" | "), col="#FC4E07")
hist(df_jobs[[var]][df_jobs$Z == 1], xlab=NULL, main=paste(var, "Treat", sep=" | "), col="#00AFBB")
hist(df_jobs[[var]], freq=FALSE, main=paste(var), xlab=NULL, col="green")
}
# Pre-treatment continuous variables: control/treatment histograms side by side
cont_pre_vars <- c('age', 'educ', 'EconHard', 'assertive', 'motivation', 'depress0')
# 6 variables x 2 arms = 12 panels, laid out on a 3 x 4 grid.
# (The original computed this as c(2*1.5, 6/1.5), relying on non-integer
# arithmetic coincidentally producing c(3, 4); state the layout directly.)
par(mfrow=c(3, 4))
for (var in cont_pre_vars) {
# Control arm (Z == 0) in red, treatment arm (Z == 1) in blue
hist(df_jobs[[var]][df_jobs$Z == 0], xlab=NULL, main=paste(var, "Control", sep=" | "), col="#FC4E07")
hist(df_jobs[[var]][df_jobs$Z == 1], xlab=NULL, main=paste(var, "Treat", sep=" | "), col="#00AFBB")
}
# Reset par
par(mfrow=c(1,1))
Looking at the covariates, the distributions appear similar between the control and treatment assignment groups. Considering depress0, there is a higher density at low values in the treatment group.
Looking at depress before and after treatment, for both control and treatment groups there is a shift to the lower levels of depression, showing the possibility that some other factors helped people, not only treatment in our study.
For continuous covariates, also report the medians, standard deviation and ranges within each treatment group. Record your results in a table. In a few sentences, comment on what you see and whether it is expected.
# header <- colnames(df_jobs)
all_var <- c("sex", "age", "race", "nonmarried", "educ", "EconHard", "assertive", "motivation", "depress0", "Z", "employ6", "depress6")
# Mean, sd, median, range by treatment of continuous vars
continuous_var <- c('age', 'educ', 'EconHard', 'assertive', 'motivation', 'depress0', 'depress6')
# Descriptive statistics.
# sapply over the data-frame columns preserves each column's type; the
# original apply() first coerced the whole frame to a matrix.
df.c <- df_jobs[df_jobs$Z == 0, ]   # control arm
df.t <- df_jobs[df_jobs$Z == 1, ]   # treatment arm
# Overall and by-arm means for every variable
All_stat <- data.frame(
  sapply(df_jobs[, all_var], mean),
  sapply(df.c[, all_var], mean),
  sapply(df.t[, all_var], mean))
colnames(All_stat) <- c("Mean", " Mean-C", " Mean-T")
round(All_stat, 2)
# Median, standard deviation and range of the continuous variables per arm
Cont_stat <- data.frame(
  sapply(df.c[, continuous_var], median),
  sapply(df.t[, continuous_var], median),
  sapply(df.c[, continuous_var], sd),
  sapply(df.t[, continuous_var], sd),
  sapply(df.c[, continuous_var], min),
  sapply(df.t[, continuous_var], min),
  sapply(df.c[, continuous_var], max),
  sapply(df.t[, continuous_var], max))
colnames(Cont_stat) <- c(" Median-C", " Median-T", " s.d.-C", " s.d.-T", " Min-C", " Min-T", " Max-C", " Max-T")
round(Cont_stat, 2)
# Join on the variable name; all = TRUE keeps variables present in only one table
Stat <- merge(All_stat, Cont_stat, by = "row.names", all = TRUE)
Stat
In terms of randomization, considering for example the variable “age”, the mean value and standard deviation are quite similar for the treated and non-treated groups, but there is a noticeable difference in the maximum age, 69 vs 61. Also, for sex there is a 7% discrepancy between the two groups, along with other not-so-similar distributions such as race and marital status. Could this affect the results?
The depression level looks a little lower in the treatment group, that could be taken into account considering the outcome “depress6”. In both cases there is a drop down in the mean value of depression from pre to post treatment time.
In terms of employment at the end of the 6 months, there’s a difference in terms of mean value, but I have no pre-treatment values to do any consideration about. Speculating with Economic Hardness it seems to underline the effect of the treatment in employment status.
# Sample sizes: overall, treated (Z == 1) and control (Z == 0)
N  <- nrow(df_jobs)
Nt <- sum(df_jobs$Z == 1)
Nc <- sum(df_jobs$Z == 0)
c(N, Nt, Nc)
[1] 398 268 130
##--------------------------------------------#
## Fisher's Exact p-value
##--------------------------------------------#
## Sharp null hypothesis that the treatment had no effect
## H0: Yi(1)=Yi(0) for i=1,...,N
## Considering outcome variable: “depression six months after the intervention assignment”
# possible combinations
nass <- choose(N,Nt) # number of assignment vectors
nass # we should compute the average treatment effect for each of them...
[1] 6.763821e+107
Two test statistics are considered: the absolute value of the difference in average outcomes by treatment status, and the absolute value of the difference in average ranks by treatment status.
# Absolute value of the difference in average outcomes by treatment status:
dif.ave.obs <- mean(df_jobs$depress6[df_jobs$Z==1]) - mean(df_jobs$depress6[df_jobs$Z==0])
dif.ave.obs
[1] -0.1360192
Tobs.dif.ave <- abs(dif.ave.obs)
Tobs.dif.ave
[1] 0.1360192
r <- rank(df_jobs$depress6, ties.method = "average")
dif.obs.r <- mean(r[df_jobs$Z==1]) - mean(r[df_jobs$Z==0])
dif.obs.r
[1] -21.53929
Tobs.dif.r <- abs(dif.obs.r)
Tobs.dif.r
[1] 21.53929
# Randomization (Fisher) test of the sharp null H0: Yi(1) = Yi(0) for all i.
# Under the sharp null all potential outcomes are known, so we can re-draw the
# assignment vector and rebuild the null distribution of both test statistics.
# fix a seed for reproducibility
set.seed(23)
# P-values estimated using K draws from the randomization distribution:
K <- 5000
# Preallocate the null distributions: growing them with c() inside the loop
# (as originally written) is O(K^2) in copies.
Tdif.ave.dist <- numeric(K)  # abs diff in average outcomes
Tdif.dist.r <- numeric(K)    # abs diff in average ranks
for(k in seq_len(K)){
  # Permute the observed assignment: a fresh randomization with the same
  # number of treated and control units (sampling without replacement)
  Z.sim <- sample(df_jobs$Z, N, replace=FALSE)
  Tdif.ave.dist[k] <- abs(mean(df_jobs$depress6[Z.sim==1]) - mean(df_jobs$depress6[Z.sim==0]))
  Tdif.dist.r[k] <- abs(mean(r[Z.sim==1]) - mean(r[Z.sim==0]))
}
# P-value = share of simulated statistics at least as extreme as the observed one
p.ave <- mean(Tdif.ave.dist >= Tobs.dif.ave)
p.r <- mean(Tdif.dist.r >= Tobs.dif.r)
c(Tobs.dif.ave, Tobs.dif.r, p.ave, p.r)
[1] 0.1360192 21.5392939 0.0900000 0.0756000
Under the null hypothesis of no effect of the program, p-values of 0.09 and 0.08 do not provide sufficient evidence to strongly reject the null hypothesis.
# Null distributions of the two statistics, observed value marked in green
par(mfrow=c(1,2))
hist(Tdif.ave.dist, freq=TRUE, breaks=100, xlab=NULL,
     main="Abs diff in average outcomes")
abline(v=Tobs.dif.ave, lwd=2, col="forestgreen")
hist(Tdif.dist.r, freq=TRUE, breaks=100, xlab=NULL,
     main="Abs diff in average ranks")
abline(v=Tobs.dif.r, lwd=2, col="forestgreen")
### Interval estimates based on FEP (simulated p-values)
# Fisher interval for a common additive effect
# H0: Yi(1) = Yi(0) + tau, tested for a grid of tau values
tau <- seq(-.5, .5, by=.05)
ntau <- length(tau)
p.dif <- rep(0, ntau)
# The observed statistic |mean diff - tau| does not depend on the simulated
# assignment, so compute it once (vectorized over tau) instead of re-deriving
# it inside the K x ntau double loop as originally written.
dif.obs <- mean(df_jobs$depress6[df_jobs$Z==1]) - mean(df_jobs$depress6[df_jobs$Z==0])
Tobs.dif <- abs(dif.obs - tau)
for(k in 1:K){
  Z.sim <- sample(df_jobs$Z, N, replace=FALSE)
  for(j in 1:ntau){
    # Impute the missing potential outcomes under H0 with additive shift tau[j]
    Y0 <- df_jobs$depress6*(df_jobs$Z==0) + (df_jobs$depress6-tau[j])*(df_jobs$Z==1)
    Y1 <- df_jobs$depress6*(df_jobs$Z==1) + (df_jobs$depress6+tau[j])*(df_jobs$Z==0)
    Tdif <- abs(mean(Y1[Z.sim==1]) - mean(Y0[Z.sim==0]) - tau[j])
    p.dif[j] <- p.dif[j] + 1 * (Tdif >= Tobs.dif[j])
  }
}
p.dif <- p.dif/K
FCI <- cbind(tau, Tobs.dif, p.dif)
FCI
tau Tobs.dif p.dif
[1,] -0.50 0.3639808 0.0000
[2,] -0.45 0.3139808 0.0000
[3,] -0.40 0.2639808 0.0012
[4,] -0.35 0.2139808 0.0086
[5,] -0.30 0.1639808 0.0438
[6,] -0.25 0.1139808 0.1464
[7,] -0.20 0.0639808 0.4164
[8,] -0.15 0.0139808 0.8670
[9,] -0.10 0.0360192 0.6568
[10,] -0.05 0.0860192 0.2756
[11,] 0.00 0.1360192 0.0880
[12,] 0.05 0.1860192 0.0204
[13,] 0.10 0.2360192 0.0034
[14,] 0.15 0.2860192 0.0006
[15,] 0.20 0.3360192 0.0000
[16,] 0.25 0.3860192 0.0000
[17,] 0.30 0.4360192 0.0000
[18,] 0.35 0.4860192 0.0000
[19,] 0.40 0.5360192 0.0000
[20,] 0.45 0.5860192 0.0000
[21,] 0.50 0.6360192 0.0000
# I will select p-values greater than 0.1, to find a confidence interval, since I want a 90% Fisher interval for a constant additive treatment, I will take the interval where values are greater than 0.1. so:
# -0.2, -0.1 or -0.25, -0.05 with -
#
#For taus between -0.25 and -0.05 we cannot reject the null hypothesis
#H0: Y(1) - Y(0) = tau at the 10% level, i.e., the observed test statistic is
#not so extreme as to reject the hypothesis of a treatment effect equal to tau.
#E.g., we cannot reject a treatment effect of -0.25 (p = 0.1464), but we do
#reject a treatment effect of -0.30, since its p-value (0.0438) is < 0.1.
Confidence interval: I seek a 90% Fisher interval for a constant additive treatment effect. Considering the test statistics computed, the observed statistic is not extreme enough to reject the null hypothesis where the p-values are greater than 0.1, so the corresponding range of taus is [-0.25, -0.05].
##--------------------------------------------#
## Neyman's Repeated Sampling Approach
##--------------------------------------------#
Yobs <- df_jobs$depress6
Z <- df_jobs$Z
tau.dif<- mean(Yobs[Z==1]) - mean(Yobs[Z==0])
tau.dif
[1] -0.1360192
Average treatment effect.
# Estimates of the sample variance of the potential control outcome
s2.c<- var(Yobs[Z==0])
s2.c
[1] 0.6196765
# Estimates of the sample variance of the potential treated outcome
s2.t<- var(Yobs[Z==1])
s2.t
[1] 0.5485401
## Estimates of the variance of the treatment effect estimators
Vneyman <- s2.c/Nc + s2.t/Nt
Vneyman
[1] 0.006813534
## Confidence intervals:
# 1-alpha = 0.9, so 1 - (alpha/2) = 0.95
X <- qnorm(0.95)
c(tau.dif - X * sqrt(Vneyman), tau.dif + X * sqrt(Vneyman))
[1] -0.2717922196 -0.0002461851
The confidence interval found is similar to the one computed by Fisher (above).
The negative values in the interval indicate that there is a statistically significant decrease in the outcome variable associated with the treatment based on Neyman’s method.
## Testing
## Hypotheses:
# H0: tau.dif = 0 vs
# H1: tau.dif != 0
tneyman <- (tau.dif-0)/sqrt(Vneyman) # test statistic for the Neyman test
tneyman
[1] -1.647836
# p-value based on the normal approximation
2*(1-pnorm(abs(tneyman))) # gives the cumulative distribution function (CDF) of the standard normal distribution
[1] 0.09938631
Given a p-value of 0.09938631, it suggests that the observed estimate is not significantly different from zero at a conventional significance level of 0.05. However, we’re interested in constructing a 90% confidence interval, the fact that the interval does not include zero would suggest that we might slightly reject the null hypothesis in favor of alternative one of non-zero-effect.
Derive the posterior distributions of the finite sample average causal effect and the super-population average causal effect. Plot the resulting posterior distributions in a histogram and report the following summary statistics of the resulting posterior distributions: mean, standard deviation, median, 2.5% and 97.5% percentiles.
# Gibbs sampler for a model-based (Bayesian) analysis WITHOUT covariates.
# Model: log(Y) | W=w ~ N(beta.w, sigma2.w) independently in each arm, with
# normal priors on beta.c/beta.t and scaled-inverse-chi^2 priors on the
# variances. At each retained draw the missing potential outcomes are imputed
# to form the finite-sample ATE, and the super-population ATE is derived from
# the lognormal means.
#
# Arguments:
#   niter, nburn, thin - total iterations, burn-in, thinning interval
#   par.prior   - list with nu.c, tau2.c, nu.t, tau2.t (normal prior mean/var
#                 for beta) and a.c, b2.c, a.t, b2.t (inv-chi^2 prior params)
#   Outcome.obs - observed outcome on the ORIGINAL scale (logged internally;
#                 must be strictly positive)
#   W           - binary treatment indicator (0/1)
#   seed        - optional RNG seed; theta.start - optional start values
#   cred.level  - credibility level for the reported posterior interval
# Returns: list(Theta = retained parameter draws,
#               Estimands = draws of c("ate.fs", "ate.sp"))
mcmc.m5 <- function(niter, nburn, thin=1,
par.prior,
Outcome.obs, W,
seed=NULL, theta.start=NULL, cred.level=0.95){
Yobs <- log(Outcome.obs) # model is on the log scale
Nc<- sum(1-W); Nt<- sum(W); N<- Nt+Nc
yobs.c<-mean(Yobs[W==0]); yobs.t<-mean(Yobs[W==1])
draws<- seq((nburn+1), niter, by=thin) # iterations that are retained
ndraws<- length(draws)
j <- 0 # Counter j=1...ndraws
# Start values: sample means/variances jittered with small random noise
if(is.null(theta.start)==TRUE){
theta <- list(beta.c = mean(Yobs[W==0]) + rnorm(1,0, 0.1),
beta.t = mean(Yobs[W==1]) + rnorm(1,0, 0.1),
sigma2.c = var(Yobs[W==0]) + rnorm(1,0, 0.01),
sigma2.t = var(Yobs[W==1]) + rnorm(1,0, 0.01)
)
}else{theta<- theta.start}
Theta <- matrix(NA, ndraws, length(unlist(theta)) )
colnames(Theta) <- names(theta)
Estimands<- matrix(NA, ndraws, 2)
colnames(Estimands)<- c("ate.fs", "ate.sp")
if(is.null(seed)==FALSE){
set.seed(seed)}
for(l in 1:niter){
## Update beta.c | rest: conjugate normal full conditional
tau2.c.obs <- 1/{Nc/theta$sigma2.c + 1/par.prior$tau2.c}
nu.c.obs <- tau2.c.obs*{(yobs.c*Nc)/theta$sigma2.c + par.prior$nu.c/par.prior$tau2.c}
theta$beta.c <- rnorm(1, nu.c.obs, sqrt(tau2.c.obs))
## Update beta.t | rest
tau2.t.obs <- 1/{Nt/theta$sigma2.t+ 1/par.prior$tau2.t}
nu.t.obs <- tau2.t.obs*{(yobs.t*Nt)/theta$sigma2.t + par.prior$nu.t/par.prior$tau2.t}
theta$beta.t <- rnorm(1, nu.t.obs, sqrt(tau2.t.obs))
## Update sigma2.c | rest: scaled inverse-chi^2 full conditional
a.c.obs <- Nc + par.prior$a.c
b2.c.obs <- {par.prior$a.c*par.prior$b2.c + sum({Yobs[W==0]-theta$beta.c}^2)}/a.c.obs
theta$sigma2.c <- {a.c.obs*b2.c.obs}/rchisq(1, a.c.obs)
## Update sigma2.t | rest
a.t.obs <- Nt + par.prior$a.t
b2.t.obs <- {par.prior$a.t*par.prior$b2.t + sum({Yobs[W==1]-theta$beta.t}^2)}/a.t.obs
theta$sigma2.t <- {a.t.obs*b2.t.obs}/rchisq(1, a.t.obs)
rm(tau2.c.obs, nu.c.obs,tau2.t.obs, nu.t.obs, a.c.obs, b2.c.obs, a.t.obs, b2.t.obs)
# Keep this iteration only if it is past burn-in and on the thinning grid
if(sum(l == draws)==1){
j <- j+1
Theta[j,]<- unlist(theta)
# Impute the missing potential outcomes from Ymis | Yobs, W, theta
Y0<-Y1<-NULL
Y0[W==0]<- Outcome.obs[W==0] # observed values, no log transform needed
Y0[W==1]<- exp(rnorm(Nt, theta$beta.c, sqrt(theta$sigma2.c))) # back-transform
Y1[W==0]<- exp(rnorm(Nc, theta$beta.t, sqrt(theta$sigma2.t))) # back-transform
Y1[W==1]<- Outcome.obs[W==1] # observed values, no log transform needed
Estimands[j,"ate.fs"] <- mean(Y1) - mean(Y0) # finite-sample ATE
# Super-population ATE: difference of lognormal means exp(mu + sigma^2/2)
Estimands[j,"ate.sp"] <- exp(theta$beta.t + 0.5*theta$sigma2.t)-exp(theta$beta.c +0.5*theta$sigma2.c) # HINT
}
}
# Posterior summaries of ATE.FS and ATE.SP at the requested credibility level
probs<-c((1-cred.level)/2,1-(1-cred.level)/2) # hint
est<-round(cbind(apply(Estimands,2,mean),
apply(Estimands,2,sd),
apply(Estimands,2,median),
apply(Estimands,2,function(x) quantile(x,probs[1])),
apply(Estimands,2,function(x) quantile(x,probs[2]))),4)
colnames(est)<-c("Mean"," sd"," Median"," CI low"," CI up")
print(est)
parms<-round(cbind(apply(Theta,2,mean),
apply(Theta,2,sd)),4)
colnames(parms)<-c("Mean"," sd")
print(parms)
return(list(Theta=Theta, Estimands=Estimands))}
par.prior <- list(nu.c=0, nu.t=0, tau2.c=100^2, tau2.t=100^2, a.c=2, b2.c=0.01, a.t=2, b2.t=0.01)
chain.mB<-mcmc.m5(niter=25000, nburn=5000, thin=1, par.prior,
Outcome.obs=df_jobs$depress6, W=df_jobs$Z, seed=2022, theta.start=NULL)
Mean sd Median CI low CI up
ate.fs -0.1362 0.0619 -0.1350 -0.2621 -0.0178
ate.sp -0.1363 0.0821 -0.1354 -0.3007 0.0219
Mean sd
beta.c 0.7022 0.0309
beta.t 0.6352 0.0218
sigma2.c 0.1231 0.0154
sigma2.t 0.1265 0.0110
## Overlapping histograms of the simulated posterior distribution of ATE.FS
hist(chain.mB$Estimands[,"ate.fs"], freq=FALSE, breaks = 20,
main = "Average Treatment Effect",
xlab="",ylab="",
cex.lab=2.5, density=30, col="#FC4E07")
hist(chain.mB$Estimands[,"ate.sp"], freq=FALSE, breaks=20,add=TRUE, col="#00AFBB", density=20)
legend("topright", cex=c(0.8,0.8,0.8), lty=c(1,1,1), col= c("#FC4E07", "#00AFBB"), legend=c("ATE.FS","ATE.SP"))
Derive the posterior distributions of the finite sample average causal effect and the super-population average causal effect. Plot the resulting posterior distributions in a histogram and report the following summary statistics of the resulting posterior distributions: mean, standard deviation, median, 2.5% and 97.5% percentiles.
Compare the results with those obtained without condition on the covariates
# Gibbs sampler for the model-based analysis WITH covariates.
# Model: log(Y) | W=w, X ~ N(XX %*% beta.w, sigma2.w) in each arm, where
# XX = [1, X]. Multivariate-normal priors on the regression coefficients
# (nu.c/Omega.c, nu.t/Omega.t) and scaled-inverse-chi^2 priors on the
# variances. Structure mirrors mcmc.m5; only the mean model differs.
#
# Arguments: as in mcmc.m5, plus
#   X - numeric covariate matrix (no intercept column; it is added internally)
#   par.prior - list with nu.c, Omega.c, nu.t, Omega.t, a.c, b2.c, a.t, b2.t
# Returns: list(Theta = retained parameter draws,
#               Estimands = draws of c("ate.fs", "ate.sp"))
mcmc.m6 <- function(niter, nburn, thin=1,
par.prior, Outcome.obs, W, X,
seed=NULL, theta.start=NULL, cred.level = 0.95){
Yobs <- log(Outcome.obs) # model is on the log scale
Nc<- sum(1-W); Nt<- sum(W); N<- Nt+Nc
XX <- as.matrix(cbind(1,X)) # design matrix with intercept
nxx<-ncol(XX)
draws<- seq((nburn+1), niter, by=thin) # retained iterations
ndraws<- length(draws)
j <- 0
# Start values: OLS fits within each arm, jittered with small random noise
lm.w0<- summary(lm(Yobs[W==0] ~ X[W==0,]))
lm.w1<- summary(lm(Yobs[W==1] ~ X[W==1,]))
if(is.null(theta.start)==TRUE){
theta <- list(beta.c = as.numeric(lm.w0$coefficients[,1]) + rnorm(nxx,0, 0.1),
beta.t = as.numeric(lm.w1$coefficients[,1]) + rnorm(nxx,0, 0.1),
sigma2.c = as.numeric(lm.w0$sigma^2) + rnorm(1,0, 0.01),
sigma2.t = as.numeric(lm.w1$sigma^2) + rnorm(1,0, 0.01))
}else{
theta<- theta.start
}
Theta <- matrix(NA, ndraws, length(unlist(theta)) )
colnames(Theta) <- names(unlist(theta))
Estimands<- matrix(NA, ndraws, 2)
colnames(Estimands)<- c("ate.fs", "ate.sp")
if(is.null(seed)==FALSE){
set.seed(seed)}
for(l in 1:niter){
# Update beta.c | rest: conjugate multivariate-normal full conditional
Omega.c.obs <- solve(solve(par.prior$Omega.c) + t(XX[W==0,])%*%XX[W==0,]/theta$sigma2.c)
nu.c.obs <- Omega.c.obs%*%(solve(par.prior$Omega.c)%*%par.prior$nu.c + t(XX[W==0,])%*%Yobs[W==0]/theta$sigma2.c)
theta$beta.c <- as.numeric(rmvnorm(1, mean= nu.c.obs, sigma=Omega.c.obs))
# Update beta.t | rest
Omega.t.obs <- solve(solve(par.prior$Omega.t) + t(XX[W==1,])%*%XX[W==1,]/theta$sigma2.t)
nu.t.obs <- Omega.t.obs%*%(solve(par.prior$Omega.t)%*%par.prior$nu.t + t(XX[W==1,])%*%Yobs[W==1]/theta$sigma2.t)
theta$beta.t <- as.numeric(rmvnorm(1, mean= nu.t.obs, sigma=Omega.t.obs))
# Update sigma2.c | rest: scaled inverse-chi^2 full conditional
a.c.obs <- Nc + par.prior$a.c
b2.c.obs <- {par.prior$a.c*par.prior$b2.c + sum({Yobs[W==0]-XX[W==0,]%*%theta$beta.c}^2)}/a.c.obs
theta$sigma2.c <- {a.c.obs*b2.c.obs}/rchisq(1, a.c.obs)
# Update sigma2.t | rest
a.t.obs <- Nt + par.prior$a.t
b2.t.obs <- {par.prior$a.t*par.prior$b2.t + sum({Yobs[W==1]-XX[W==1,]%*%theta$beta.t}^2)}/a.t.obs
theta$sigma2.t <- {a.t.obs*b2.t.obs}/rchisq(1, a.t.obs)
rm(Omega.c.obs, nu.c.obs, Omega.t.obs, nu.t.obs, a.c.obs, b2.c.obs, a.t.obs, b2.t.obs)
# Keep this iteration only if past burn-in and on the thinning grid
if(sum(l == draws)==1){
j <- j+1
Theta[j,]<- unlist(theta)
## Impute the missing potential outcomes from Ymis | Yobs, W, X, theta
Y0<-Y1<-NULL
Y0[W==0]<- Outcome.obs[W==0] # observed values, no log transform needed
Y0[W==1]<- exp(rnorm(Nt, XX[W==1,]%*%theta$beta.c, sqrt(theta$sigma2.c))) # back-transform
Y1[W==0]<- exp(rnorm(Nc, XX[W==0,]%*%theta$beta.t, sqrt(theta$sigma2.t))) # back-transform
Y1[W==1]<- Outcome.obs[W==1] # observed values, no log transform needed
Estimands[j,"ate.fs"] <- mean(Y1)-mean(Y0) # finite-sample ATE
# Super-population ATE: lognormal means averaged over the empirical X dist.
Estimands[j,"ate.sp"] <- mean(exp(XX%*%theta$beta.t + 0.5*theta$sigma2.t)) - mean(exp(XX%*%theta$beta.c + 0.5*theta$sigma2.c))
}
}
# Summary statistics of the simulated posterior distribution of ATE.FS and ATE.SP
probs<-c((1-cred.level)/2,1-(1-cred.level)/2)
est<-round(cbind(apply(Estimands,2,mean),
apply(Estimands,2,sd),
apply(Estimands,2,median),
apply(Estimands,2,function(x) quantile(x,probs[1])),
apply(Estimands,2,function(x) quantile(x,probs[2]))),4)
colnames(est)<-c("Mean"," sd"," Median"," CI low"," CI up")
print(est)
parms<-round(cbind(apply(Theta,2,mean),
apply(Theta,2,sd)),4)
colnames(parms)<-c("Mean"," sd")
print(parms)
return(list(Theta=Theta, Estimands=Estimands))
}
X <- as.matrix(df_jobs[, c("sex","age","race","nonmarried","educ","EconHard","assertive","motivation","depress0")])
ncov<- ncol(X)
par.prior <- list(nu.c=rep(0, {ncov+1}), Omega.c=diag(100^2,{ncov+1}),
nu.t=rep(0, {ncov+1}), Omega.t=diag(100^2,{ncov+1}),
a.c=2, b2.c=0.01, a.t=2, b2.t=0.01)
chain.mC<-mcmc.m6(niter=20000, nburn=5000, thin=1,
par.prior,
Outcome.obs=Yobs, W=Z, X=X,
seed=2022, theta.start=NULL)
Mean sd Median CI low CI up
ate.fs -0.1864 0.0670 -0.1843 -0.3227 -0.0614
ate.sp -0.1876 0.0863 -0.1846 -0.3640 -0.0261
Mean sd
beta.c1 0.0984 0.4469
beta.c2 -0.0182 0.0670
beta.c3 0.0023 0.0033
beta.c4 0.1368 0.0950
beta.c5 -0.0540 0.0655
beta.c6 0.0034 0.0164
beta.c7 0.0674 0.0389
beta.c8 -0.0288 0.0412
beta.c9 0.0543 0.0398
beta.c10 0.0261 0.1114
beta.t1 0.6888 0.2978
beta.t2 0.0320 0.0439
beta.t3 0.0006 0.0023
beta.t4 0.0546 0.0552
beta.t5 -0.0278 0.0465
beta.t6 -0.0123 0.0108
beta.t7 0.0758 0.0280
beta.t8 -0.0293 0.0247
beta.t9 -0.0525 0.0266
beta.t10 0.0692 0.0724
sigma2.c 0.1220 0.0159
sigma2.t 0.1237 0.0109
## Overlapping histograms of the simulated posterior distribution of ATE.FS
hist(chain.mC$Estimands[,"ate.fs"], freq=FALSE, breaks = 20,
main = "Average Treatment Effect",
xlab="",ylab="",
cex.lab=2.5, density=30, col="#FC4E07")
hist(chain.mC$Estimands[,"ate.sp"], freq=FALSE, breaks=20,add=TRUE, col="#00AFBB", density=20)
legend("topright", cex=c(0.8,0.8,0.8), lty=c(1,1,1), col= c("#FC4E07", "#00AFBB"), legend=c("ATE.FS","ATE.SP"))
## Overlapping histograms of the simulated posterior distribution of ATE.FS
hist(chain.mB$Estimands[,"ate.fs"], freq=FALSE, breaks = 20,
main = "Finite-Sample Average Treatment Effect",
xlab="",ylab="",
cex.lab=2.5, density=30, col="#FC4E07") # "#FC4E07", "#00AFBB"
hist(chain.mC$Estimands[,"ate.fs"], freq=FALSE, breaks=20,add=TRUE, col="#00AFBB", density=20)
legend("topright", cex=c(0.8,0.8,0.8), lty=c(1,1,1), col= c("#FC4E07", "#00AFBB"), legend=c("Model B","Model C"))
## Overlapping histograms of the simulated posterior distribution of ATE.PS
hist(chain.mB$Estimands[,"ate.sp"], freq=FALSE, breaks = 20,
main = "Super-Population Average Treatment Effect",
xlab="",ylab="",
cex.lab=2.5, density=30, col="#FC4E07") # "#FC4E07", "#00AFBB"
hist(chain.mC$Estimands[,"ate.sp"], freq=FALSE, breaks=20,add=TRUE, col="#00AFBB", density=20)
legend("topright", cex=c(0.8,0.8,0.8), lty=c(1,1,1), col= c("#FC4E07", "#00AFBB"), legend=c("Model B","Model C"))
Looking at these distributions, going from model B (without conditioning on covariates) to model C (with conditioning), the covariates seem to have an effect on the estimates: for example, the curves are shifted to the left.
library(ggplot2)
library(MatchIt)
# import file
df_lux_all <- read.table('TrainingLux.txt')
df_lux_all
# df withou Outcome
df_lux <- df_lux_all[, !(names(df_lux_all) == "Outcome")]
df_lux
# Covariates
X <- df_lux[, !(names(df_lux) == "TREAT")]
X
# Standardized difference in means for a single covariate.
#
# Arguments:
#   x     - numeric covariate (NAs allowed but warned about and removed)
#   treat - binary treatment indicator (0/1); NAs are not allowed
# Returns a named numeric vector:
#   Mean, Mean.t, Mean.c, Sd.t, Sd.c, Std.Mean.Diff where the standardized
#   difference uses the pooled (average-of-variances) denominator.
std_diff <- function(x, treat){
  ### Param checks
  # Check for NAs in x
  if(any(is.na(x))){warning("NAs removed, check x and think of other options")}
  # Check for NAs in treat
  if(any(is.na(treat))){stop("treatment indicator 'treat' is not supposed to contain NA, check it!")}
  ### Overall mean (named mean.all: a local called 'mean' would shadow base::mean)
  mean.all <- mean(x, na.rm = TRUE)
  ### Mean of 'x' among treated and control units
  mean.t <- mean(x[treat == 1], na.rm = TRUE)
  mean.c <- mean(x[treat == 0], na.rm = TRUE)
  ### Variance of 'x' among treated and control units
  var.t <- var(x[treat == 1], na.rm = TRUE)
  var.c <- var(x[treat == 0], na.rm = TRUE)
  ### Standardized difference in means
  std.diff <- (mean.t - mean.c) / sqrt((var.t + var.c)/2)
  ### Returning results
  res <- c(Mean = mean.all, Mean.t = mean.t, Mean.c = mean.c, Sd.t = sqrt(var.t), Sd.c = sqrt(var.c), Std.Mean.Diff = std.diff)
  return(res)
}
t(sapply(X, std_diff, treat = df_lux$TREAT, simplify = TRUE))
Mean Mean.t Mean.c Sd.t Sd.c Std.Mean.Diff
age 34.51204676 36.82580645 34.43507987 9.0586485 11.5128683 0.23079412
gender 0.47691532 0.64884793 0.47119600 0.4775505 0.4991773 0.36368043
married 0.42436651 0.55576037 0.41999571 0.4971102 0.4935654 0.27408325
natio1 0.29968548 0.05529954 0.30781494 0.2286694 0.4615966 -0.69324097
natio2 0.29989318 0.27004608 0.30088604 0.4441881 0.4586503 -0.06830902
natio3 0.19672423 0.27373272 0.19416255 0.4460790 0.3955607 0.18874397
natio4 0.08417898 0.18709677 0.08075543 0.3901690 0.2724633 0.31601927
natio5 0.11951813 0.21382488 0.11638103 0.4101937 0.3206862 0.26467071
educ1 0.27256543 0.27741935 0.27240396 0.4479317 0.4452035 0.01123093
educ2 0.48991158 0.34009217 0.49489530 0.4739584 0.4999816 -0.31777709
educ3 0.23752300 0.38248848 0.23270074 0.4862190 0.4225596 0.32884047
EmployLevel1 0.17266038 0.09677419 0.17518472 0.2957864 0.3801309 -0.23022667
EmployLevel2 0.39950151 0.43778802 0.39822792 0.4963434 0.4895404 0.08025116
EmployLevel3 0.33487627 0.37972350 0.33338443 0.4855418 0.4714298 0.09683471
EmployLevel4 0.07735446 0.07465438 0.07744428 0.2629540 0.2672992 -0.01052254
EmployLevel5 0.01560738 0.01105991 0.01575865 0.1046311 0.1245423 -0.04085212
skill 0.95178328 0.94746544 0.95192691 0.2232053 0.2139240 -0.02040800
sector0 0.32959468 0.15023041 0.33556121 0.3574619 0.4721935 -0.44255413
sector1 0.29915139 0.36405530 0.29699237 0.4813861 0.4569402 0.14289312
sector2 0.10438550 0.17419355 0.10206334 0.3794507 0.3027362 0.21014302
sector3 0.14025874 0.16774194 0.13934451 0.3738093 0.3463110 0.07881112
sector4 0.12660970 0.14377880 0.12603857 0.3510271 0.3318979 0.05193331
work_12 0.64289953 0.75391705 0.63920655 0.4309261 0.4802381 0.25142095
nb_mthbf_12 6.39051095 4.97050691 6.43774719 3.6329590 5.4659510 -0.31615753
Compare graphically the distributions of estimated propensity scores within the treatment groups and explain what you see.
# propensity score
mod <- glm(TREAT~., data=df_lux, family=binomial(link=logit))
summary(mod)
Call:
glm(formula = TREAT ~ ., family = binomial(link = logit), data = df_lux)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5393 -0.2453 -0.1609 -0.0911 3.7833
Coefficients: (4 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.703582 0.409327 -4.162 3.16e-05 ***
age 0.020601 0.003525 5.844 5.09e-09 ***
gender 0.731334 0.070656 10.351 < 2e-16 ***
married 0.229798 0.073204 3.139 0.001695 **
natio1 -2.170796 0.155755 -13.937 < 2e-16 ***
natio2 -0.659103 0.100443 -6.562 5.31e-11 ***
natio3 -0.434414 0.102592 -4.234 2.29e-05 ***
natio4 -0.021194 0.111200 -0.191 0.848847
natio5 NA NA NA NA
educ1 -0.461297 0.086224 -5.350 8.80e-08 ***
educ2 -0.983716 0.093343 -10.539 < 2e-16 ***
educ3 NA NA NA NA
EmployLevel1 -0.586041 0.326075 -1.797 0.072294 .
EmployLevel2 -0.145899 0.311767 -0.468 0.639803
EmployLevel3 -0.058455 0.310292 -0.188 0.850573
EmployLevel4 0.070019 0.324800 0.216 0.829320
EmployLevel5 NA NA NA NA
skill -0.295697 0.149766 -1.974 0.048337 *
sector0 -2.309417 0.157679 -14.646 < 2e-16 ***
sector1 0.368923 0.105600 3.494 0.000477 ***
sector2 0.257943 0.120069 2.148 0.031691 *
sector3 -0.436960 0.120721 -3.620 0.000295 ***
sector4 NA NA NA NA
work_12 0.959554 0.136134 7.049 1.81e-12 ***
nb_mthbf_12 -0.263125 0.009887 -26.614 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9590.8 on 33701 degrees of freedom
Residual deviance: 7702.0 on 33681 degrees of freedom
AIC: 7744
Number of Fisher Scoring iterations: 8
pscores <- mod$fitted.values
std_diff(pscores, treat = df_lux$TREAT)
Mean Mean.t Mean.c Sd.t Sd.c Std.Mean.Diff
0.03219394 0.11820900 0.02933266 0.11267286 0.05052386 1.01788126
# dens overlap
# Overlaid, semi-transparent density curves of x for treated vs control units
density_overlap <- function(x, treat, alpha = 0.25){
  ### Assemble the plotting frame: treated values first, then controls
  n.t <- sum(treat)
  n.c <- sum(treat == 0)
  plot.df <- data.frame(
    Legend = c(rep("Treated", n.t), rep("Controls", n.c)),
    Value  = c(x[treat == 1], x[treat == 0]))
  ### One density layer per group, filled by the Legend factor
  ggplot(plot.df, aes(x = Value, fill = Legend)) + geom_density(alpha = alpha)
}
# Hist overlap
hist_overlap <- function(x, treat, alpha = 0.5, ...){
### Formatting data
data <- data.frame(Legend = c(rep("Treated", sum(treat)), rep("Controls", sum(treat == 0))), Value = c(x[treat ==1], x[treat == 0]))
### Calling 'ggplot'
ggplot(data, aes(x = Value, fill = Legend, after_stat(density))) + geom_histogram(alpha = alpha, position = "identity", ...)
}
density_overlap(x = pscores, treat = df_lux$TREAT)
hist_overlap(x = pscores, treat = df_lux$TREAT, bins = 20)
The non-treated distribution is more condensed to the origin, showing a significant difference on distributions of the treated and control groups.
We already know that in this observational study subjects were not randomly assigned to treatment, and this graph underlines this statement. The treatment assignment has been affected by specific values of the units’ covariates, favouring some characteristics and showing a lack of randomness.
Propensity score is useful for achieving covariate balance in observational studies, since the lack of balance could influence the reliability of causal effect estimates, as the treated and control groups may not be comparable with respect to observed covariates.
The propensity score serves as a summary measure of the covariates, and how the distribution of X appears balanced between treated and control groups.
The propensity score is the coarsest balancing score, it is a function of every balancing score, so it provides the biggest benefit in terms of reducing the number of variables we need to adjust for.
Its advantage lies in significantly reducing the number of variables requiring adjustment.
How many units did you discard? Why is it important to discard these units?
### 2) TRIMMING
# We want to discard control units having PS lower than the min PS for the treated.
# We could also discard treated units for a common support but in this case we prefer to salvage all the n treated
# Propensity-score summaries by treatment arm (pscores estimated earlier in the script)
by(pscores,df_lux$TREAT,summary)
df_lux$TREAT: 0
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000378 0.004740 0.013512 0.029333 0.030593 0.694177
---------------------------------------------------------------------------------------------------
df_lux$TREAT: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0007796 0.0370989 0.0756885 0.1182090 0.1721260 0.6025573
# Common-support bounds taken from the treated group only, so no treated unit is dropped
mincut <- min(pscores[df_lux$TREAT==1])
maxcut <- max(pscores[df_lux$TREAT==1])
# Keep only units whose PS falls inside the treated-group support
df_lux2 <- df_lux[pscores>=mincut & pscores<=maxcut,]
nrow(df_lux) - nrow(df_lux2) # number discarded
[1] 370
table(df_lux2$TREAT) # only controls discarded, as we wanted
0 1
32247 1085
# Row indices of the dropped units (reused later to keep the Outcome vector aligned)
discarded <- which(pscores < mincut | pscores > maxcut) # discarded units
#discarded
I’ve discarded 370 units. It’s important to discard these units to achieve common support between the treatment and control groups, ensuring that the distributions of the estimated propensity scores overlap for both groups. This helps create balanced groups and mitigates the lack of randomization. Removing these units makes the estimated treatment effects more reliable and reduces the potential for bias.
# 2.1) Standardized Diff. in Mean for the trimmed data
# Drop the treatment indicator so only covariates are passed to std_diff()
X.trim <- df_lux2[, !(names(df_lux2) == "TREAT")]
# One row per covariate: group means/SDs and the standardized mean difference
t(sapply(X.trim, std_diff, treat = df_lux2$TREAT, simplify = TRUE))
Mean Mean.t Mean.c Sd.t Sd.c Std.Mean.Diff
age 34.66965679 36.82580645 34.59710981 9.0586485 11.4667373 0.215685561
gender 0.48193928 0.64884793 0.47632338 0.4775505 0.4994468 0.353084345
married 0.42883715 0.55576037 0.42456663 0.4971102 0.4942847 0.264663876
natio1 0.29206168 0.05529954 0.30002791 0.2286694 0.4582769 -0.675762473
natio2 0.30322213 0.27004608 0.30433839 0.4441881 0.4601338 -0.075829142
natio3 0.19890796 0.27373272 0.19639036 0.4460790 0.3972733 0.183110105
natio4 0.08502340 0.18709677 0.08158899 0.3901690 0.2737418 0.313059906
natio5 0.12078483 0.21382488 0.11765436 0.4101937 0.3222034 0.260743540
educ1 0.27517101 0.27741935 0.27509536 0.4479317 0.4465692 0.005196181
educ2 0.48481939 0.34009217 0.48968896 0.4739584 0.4999014 -0.307115557
educ3 0.24000960 0.38248848 0.23521568 0.4862190 0.4241401 0.322799140
EmployLevel1 0.17037682 0.09677419 0.17285329 0.2957864 0.3781263 -0.224116604
EmployLevel2 0.39973599 0.43778802 0.39845567 0.4963434 0.4895878 0.079785340
EmployLevel3 0.33589344 0.37972350 0.33441871 0.4855418 0.4717942 0.094637885
EmployLevel4 0.07821313 0.07465438 0.07833287 0.2629540 0.2686988 -0.013837132
EmployLevel5 0.01578063 0.01105991 0.01593947 0.1046311 0.1252433 -0.042284478
skill 0.95139806 0.94746544 0.95153037 0.2232053 0.2147598 -0.018559388
sector0 0.32434297 0.15023041 0.33020126 0.3574619 0.4702927 -0.430856753
sector1 0.30220209 0.36405530 0.30012094 0.4813861 0.4583174 0.136032475
sector2 0.10554422 0.17419355 0.10323441 0.3794507 0.3042695 0.206324173
sector3 0.14034561 0.16774194 0.13942382 0.3738093 0.3463936 0.078582351
sector4 0.12756510 0.14377880 0.12701957 0.3510271 0.3330001 0.048984655
work_12 0.64784591 0.75391705 0.64427699 0.4309261 0.4787392 0.240723538
nb_mthbf_12 6.43693748 4.97050691 6.48627779 3.6329590 5.4559512 -0.327029654
# 2.2) Re-estimation of the propensity score
# Logistic regression of treatment on all remaining covariates, on the trimmed sample
mod2 <- glm(TREAT~., data = df_lux2, family = binomial(link=logit))
summary(mod2)
Call:
glm(formula = TREAT ~ ., family = binomial(link = logit), data = df_lux2)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3347 -0.2463 -0.1614 -0.0931 3.7869
Coefficients: (4 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.669433 0.409248 -4.079 4.52e-05 ***
age 0.020366 0.003528 5.773 7.80e-09 ***
gender 0.737173 0.070708 10.426 < 2e-16 ***
married 0.237653 0.073236 3.245 0.001174 **
natio1 -2.170740 0.155773 -13.935 < 2e-16 ***
natio2 -0.663417 0.100434 -6.605 3.96e-11 ***
natio3 -0.439488 0.102611 -4.283 1.84e-05 ***
natio4 -0.015758 0.111145 -0.142 0.887255
natio5 NA NA NA NA
educ1 -0.470894 0.086238 -5.460 4.75e-08 ***
educ2 -0.994196 0.093392 -10.645 < 2e-16 ***
educ3 NA NA NA NA
EmployLevel1 -0.593443 0.326219 -1.819 0.068887 .
EmployLevel2 -0.150265 0.311892 -0.482 0.629958
EmployLevel3 -0.058931 0.310390 -0.190 0.849418
EmployLevel4 0.066706 0.324933 0.205 0.837345
EmployLevel5 NA NA NA NA
skill -0.325093 0.149084 -2.181 0.029213 *
sector0 -2.307809 0.157831 -14.622 < 2e-16 ***
sector1 0.382290 0.105687 3.617 0.000298 ***
sector2 0.259421 0.120182 2.159 0.030883 *
sector3 -0.439174 0.120826 -3.635 0.000278 ***
sector4 NA NA NA NA
work_12 0.979482 0.136277 7.187 6.60e-13 ***
nb_mthbf_12 -0.265139 0.009910 -26.755 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9566.4 on 33331 degrees of freedom
Residual deviance: 7690.7 on 33311 degrees of freedom
AIC: 7732.7
Number of Fisher Scoring iterations: 8
# Re-estimated propensity scores = fitted probabilities from mod2
pscores2 <- mod2$fitted.values
# 2.3) Balance assessment (Standardized Diff. in Mean and graphical checks)
std_diff(pscores2, treat = df_lux2$TREAT)
Mean Mean.t Mean.c Sd.t Sd.c Std.Mean.Diff
0.03255130 0.11918544 0.02963636 0.11419756 0.05062333 1.01381995
# Overlaid PS histograms for treated vs controls (helper defined elsewhere in the file)
hist_overlap(pscores2, treat = df_lux2$TREAT)
You are allowed to choose size and bounds of the subclasses. You are supposed to create the best subclasses based on your own reasoning. Create a table showing the number of treated and control units within each of the five subclasses.
Briefly comment and explain your choice. (Hint: look at the distribution of the propensity score in the two groups, check the overlap, check the balance within subclasses and the number of treated and control units, etc).
### 3) SUBCLASSIFICATION BASED ON THE PROPENSITY SCORE
# The goal is to create subclasses of treated and control units sharing
# similar values for the propensity score
# 3.1) Defining subclasses from the inspection of the quantiles table
# Quantile grid shared by the three columns of the inspection table
probs <- seq(0, 1, by = 0.05)
quant.tab <- data.frame(Quantiles   = probs,
                        PS.Controls = as.numeric(quantile(pscores2[df_lux2$TREAT == 0], probs = probs)),
                        PS.Treated  = as.numeric(quantile(pscores2[df_lux2$TREAT == 1], probs = probs)),
                        PS.Whole    = as.numeric(quantile(pscores2, probs = probs)))
# Cut points chosen after trial and error (earlier attempts: .95 / .4, .75, .85, .9, .95)
breaks <- quantile(pscores2, c(.65, .80, .85, .9))
# Subclass labels 1..5 from right-closed PS intervals:
# (-Inf, b1], (b1, b2], (b2, b3], (b3, b4], (b4, Inf)
bins <- as.integer(cut(pscores2, breaks = c(-Inf, breaks, Inf), labels = FALSE))
table(bins, df_lux2$TREAT) # Number of T and C in each subclass
bins 0 1
1 21518 148
2 4842 160
3 1579 88
4 1523 140
5 2785 549
table(bins) # Tot number of people in each subclass
bins
1 2 3 4 5
21666 5002 1667 1663 3334
# 3.2) Balance assessment within each block
# Standardized Diff. in Mean, computed separately inside every subclass.
# seq_along() replaces the fragile 1:length() idiom (which yields c(1, 0) on empty input).
mapply(seq_along(unique(bins)), FUN = function(b)(std_diff(pscores2[bins == b], treat = df_lux2$TREAT[bins == b])))
[,1] [,2] [,3] [,4] [,5]
Mean 0.008919212 0.030725485 0.046469389 0.062117368 0.16715712
Mean.t 0.014628061 0.031783538 0.046538920 0.063134609 0.19878253
Mean.c 0.008879947 0.030690522 0.046465514 0.062023859 0.16092288
Sd.t 0.005907308 0.005312641 0.003356584 0.006169522 0.11219332
Sd.c 0.006383441 0.005063838 0.003501418 0.006067369 0.08937412
Std.Mean.Diff 0.934655235 0.210611243 0.021402553 0.181534864 0.37326812
# mapply(X.trim, FUN = function(x)(std_diff_block(x, treat = df_lux2$TREAT, blocks = bins, weights = "total")))
# Graphical balance assessment
# Overlaid density plot of `x` for treated vs control units.
#   x     : numeric vector (here, propensity scores)
#   treat : 0/1 treatment indicator, same length as x
#   alpha : fill transparency passed to geom_density
density_overlap <- function(x, treat, alpha = 0.25){
  # Stack the two groups into the long format ggplot expects
  n_treated  <- sum(treat)
  n_controls <- sum(treat == 0)
  plot_data <- data.frame(
    Legend = c(rep("Treated", n_treated), rep("Controls", n_controls)),
    Value  = c(x[treat == 1], x[treat == 0])
  )
  # Semi-transparent densities, one fill per group
  ggplot(plot_data, aes(x = Value, fill = Legend)) +
    geom_density(alpha = alpha)
}
# One overlap density plot per subclass, in ascending block order (returns a list of ggplots)
mapply(unique(bins)[order(unique(bins))],
FUN = function(B)(density_overlap(x = pscores2[bins == B], treat = df_lux2$TREAT[bins == B]) +
ggtitle(paste("Overlap, block", B))), SIMPLIFY = FALSE)
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
After some attempts I reached this as my best-fitting overlap between Treated and Control units. The first block has a significant non-overlapping area, but the other blocks show a better situation. This is underlined also in the statistics table, where Mean.t = 0.014628061 and Mean.c = 0.008879947 for block 1, showing a significant difference. Block 5 behaves similarly, but still with a reasonably good fit.
Now that the study design phase is complete, read in the outcome, Outcome.
Naively pretending that this observational study was actually a completely randomized experiment, and ignoring covariates, calculate a Neyman estimate of the average treatment effect and a large sample 95% interval and enter both in a table.
Calculate the naive Neyman estimate as in the previous point, but on the subset of units obtained in 5(a).
Report it in the table and comment briefly.
# analysis
# Read in the outcome: attach it to both the full and the trimmed data,
# using `discarded` (row indices dropped at trimming) to keep df_lux2 rows aligned
df_lux$Outcome <- df_lux_all$Outcome
df_lux
df_lux2$Outcome <- df_lux_all$Outcome[-discarded]
df_lux2
# Neyman estimator of the average treatment effect under (assumed) complete
# randomization: difference in group means, with the conservative variance
# estimate Var.t/Nt + Var.c/Nc and a normal-approximation (1 - alpha) interval.
#
#   outcome : numeric outcome vector
#   treat   : 0/1 treatment indicator, same length as outcome
#   alpha   : significance level (default 0.05 -> 95% interval)
#
# Returns a 1-row matrix with columns ATE, Var, int.lower, int.upper.
neyman <- function(outcome, treat, alpha = 0.05){
  stopifnot(length(outcome) == length(treat))
  # Split the sample by treatment arm
  Yt <- outcome[treat == 1]
  Yc <- outcome[treat == 0]
  Nt <- length(Yt)
  Nc <- length(Yc)
  # ATE: simple difference in means
  ate <- mean(Yt) - mean(Yc)
  # Conservative Neyman variance estimator (ignores the unidentifiable covariance term)
  Var <- var(Yt) / Nt + var(Yc) / Nc
  # Large-sample normal confidence interval
  z <- qnorm(1 - alpha / 2)
  cbind(ATE = ate,
        Var = Var,
        int.lower = ate - z * sqrt(Var),
        int.upper = ate + z * sqrt(Var))
}
### 2) ATE, Naive Neyman (i.e., Neyman as if dataset was randomized)
# Ignores covariates entirely: difference in means + large-sample 95% CI
ney.naive <- neyman(outcome = df_lux$Outcome, treat = df_lux$TREAT, alpha = 0.05)
ney.naive
ATE Var int.lower int.upper
[1,] -0.02727316 0.0002374124 -0.05747266 0.002926344
### 3) ATE, Neyman on the trimmed subset
ney.trimm <- neyman(outcome = df_lux2$Outcome, treat = df_lux2$TREAT, alpha = 0.05)
ney.trimm
ATE Var int.lower int.upper
[1,] -0.02608443 0.0002375001 -0.05628951 0.004120645
# Stack the two estimates into a single labelled comparison table
ney <- rbind(ney.naive,ney.trimm)
rownames(ney) <- c("ATE Naive Neyman", "ATE Naive Neyman on trimmed subset")
round(ney,2)
ATE Var int.lower int.upper
ATE Naive Neyman -0.03 0 -0.06 0
ATE Naive Neyman on trimmed subset -0.03 0 -0.06 0
The interval on the trimmed subset is slightly narrower.
Please briefly present the method and explain your preference and discuss results you have obtained.
### 5) Estimating ATT on matched data
# 4.2) Non-Exact Matching: Nearest Neighbors with Propensity Score
# Without replacement, discarding control units
# outside the support of the distance measure of the treated units
# NOTE(review): `TREAT ~ .` makes every remaining column a predictor of the
# propensity score, including Outcome (it appears in the balance tables below).
# The outcome is normally excluded from the matching model — confirm intent.
m.nn1 <- matchit(TREAT~ ., data = df_lux, method = "nearest", discard='control', distance = "glm")
m.nn1
A matchit object
- method: 1:1 nearest neighbor matching without replacement
- distance: Propensity score [common support]
- estimated with logistic regression
- common support: control units dropped
- number of obs.: 33702 (original), 2170 (matched)
- target estimand: ATT
- covariates: age, gender, married, natio1, natio2, natio3, natio4, natio5, educ1, educ2, educ3, EmployLevel1, EmployLevel2, EmployLevel3, EmployLevel4, EmployLevel5, skill, sector0, sector1, sector2, sector3, sector4, work_12, nb_mthbf_12, Outcome
summary(m.nn1) ## how many units are matched? --> "THE" nearest neighbor
Call:
matchit(formula = TREAT ~ ., data = df_lux, method = "nearest",
distance = "glm", discard = "control")
Summary of Balance for All Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max
distance 0.1185 0.0293 0.7892 4.9882 0.3765 0.5563
age 36.8258 34.4351 0.2639 0.6191 0.0582 0.1921
gender 0.6488 0.4712 0.3722 . 0.1777 0.1777
married 0.5558 0.4200 0.2732 . 0.1358 0.1358
natio1 0.0553 0.3078 -1.1048 . 0.2525 0.2525
natio2 0.2700 0.3009 -0.0695 . 0.0308 0.0308
natio3 0.2737 0.1942 0.1785 . 0.0796 0.0796
natio4 0.1871 0.0808 0.2727 . 0.1063 0.1063
natio5 0.2138 0.1164 0.2377 . 0.0974 0.0974
educ1 0.2774 0.2724 0.0112 . 0.0050 0.0050
educ2 0.3401 0.4949 -0.3268 . 0.1548 0.1548
educ3 0.3825 0.2327 0.3082 . 0.1498 0.1498
EmployLevel1 0.0968 0.1752 -0.2652 . 0.0784 0.0784
EmployLevel2 0.4378 0.3982 0.0797 . 0.0396 0.0396
EmployLevel3 0.3797 0.3334 0.0955 . 0.0463 0.0463
EmployLevel4 0.0747 0.0774 -0.0106 . 0.0028 0.0028
EmployLevel5 0.0111 0.0158 -0.0449 . 0.0047 0.0047
skill 0.9475 0.9519 -0.0200 . 0.0045 0.0045
sector0 0.1502 0.3356 -0.5187 . 0.1853 0.1853
sector1 0.3641 0.2970 0.1394 . 0.0671 0.0671
sector2 0.1742 0.1021 0.1902 . 0.0721 0.0721
sector3 0.1677 0.1393 0.0760 . 0.0284 0.0284
sector4 0.1438 0.1260 0.0506 . 0.0177 0.0177
work_12 0.7539 0.6392 0.2663 . 0.1147 0.1147
nb_mthbf_12 4.9705 6.4377 -0.4039 0.4418 0.1709 0.3960
Outcome 0.4691 0.4964 -0.0547 . 0.0273 0.0273
Summary of Balance for Matched Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max Std. Pair Dist.
distance 0.1185 0.1184 0.0011 1.0076 0.0000 0.0028 0.0015
age 36.8258 36.8313 -0.0006 0.6900 0.0323 0.0802 1.1654
gender 0.6488 0.6581 -0.0193 . 0.0092 0.0092 0.8264
married 0.5558 0.5438 0.0241 . 0.0120 0.0120 0.8774
natio1 0.0553 0.0470 0.0363 . 0.0083 0.0083 0.3105
natio2 0.2700 0.2654 0.0104 . 0.0046 0.0046 0.7619
natio3 0.2737 0.2719 0.0041 . 0.0018 0.0018 0.7814
natio4 0.1871 0.2129 -0.0662 . 0.0258 0.0258 0.6759
natio5 0.2138 0.2028 0.0270 . 0.0111 0.0111 0.6834
educ1 0.2774 0.2765 0.0021 . 0.0009 0.0009 0.8419
educ2 0.3401 0.3392 0.0019 . 0.0009 0.0009 0.8424
educ3 0.3825 0.3843 -0.0038 . 0.0018 0.0018 0.8268
EmployLevel1 0.0968 0.1097 -0.0436 . 0.0129 0.0129 0.5611
EmployLevel2 0.4378 0.4157 0.0446 . 0.0221 0.0221 0.8769
EmployLevel3 0.3797 0.3733 0.0133 . 0.0065 0.0065 0.8413
EmployLevel4 0.0747 0.0922 -0.0666 . 0.0175 0.0175 0.5085
EmployLevel5 0.0111 0.0092 0.0176 . 0.0018 0.0018 0.1939
skill 0.9475 0.9410 0.0289 . 0.0065 0.0065 0.4668
sector0 0.1502 0.1779 -0.0774 . 0.0276 0.0276 0.3611
sector1 0.3641 0.3373 0.0555 . 0.0267 0.0267 0.8026
sector2 0.1742 0.1705 0.0097 . 0.0037 0.0037 0.7047
sector3 0.1677 0.1613 0.0173 . 0.0065 0.0065 0.6833
sector4 0.1438 0.1530 -0.0263 . 0.0092 0.0092 0.7092
work_12 0.7539 0.7263 0.0642 . 0.0276 0.0276 0.6248
nb_mthbf_12 4.9705 4.6876 0.0779 0.6686 0.0929 0.1585 0.9851
Outcome 0.4691 0.4387 0.0609 . 0.0304 0.0304 0.9253
Sample Sizes:
Control Treated
All 32617 1085
Matched 1085 1085
Unmatched 31280 0
Discarded 252 0
plot(summary(m.nn1))
summary(m.nn1$distance)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0003871 0.0049891 0.0141478 0.0321939 0.0327900 0.7005737
summary(pscores2)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0007632 0.0052455 0.0143247 0.0325513 0.0331730 0.6161660
# Obtain matched dataset from MatchIt output
m.mydata <- match.data(m.nn1)
head(m.mydata)
# Neyman's method for estimating causal effects is used to calculate the Average Treatment Effect on the Treated (ATT)
# (difference in means on the 1:1 matched sample, with a large-sample 95% CI)
ney_match <- neyman(outcome = m.mydata$Outcome, treat = m.mydata$TREAT, alpha = 0.05)
ney_match
ATE Var int.lower int.upper
[1,] 0.03041475 0.0004569098 -0.01148036 0.07230985
# Regression
# Equivalent check: OLS of Outcome on TREAT reproduces the difference in means
summary(lm(Outcome~TREAT, data=m.mydata))
Call:
lm(formula = Outcome ~ TREAT, data = m.mydata)
Residuals:
Min 1Q Median 3Q Max
-0.4691 -0.4691 -0.4387 0.5309 0.5613
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.43871 0.01511 29.025 <2e-16 ***
TREAT 0.03041 0.02138 1.423 0.155
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4979 on 2168 degrees of freedom
Multiple R-squared: 0.000933, Adjusted R-squared: 0.0004722
F-statistic: 2.025 on 1 and 2168 DF, p-value: 0.1549
I’ve used MatchIt function with the “nearest” method to perform nearest neighbor matching without replacement and Neyman’s method to estimate the ATT using the matched dataset. The balance assessment shows that the matching procedure has improved balance across covariates in the matched dataset.
The estimated ATT is approximately 0.0304 with a confidence interval spanning from -0.0115 to 0.0723, thus the treatment effect may be positive, negative or zero.
The linear regression results confirm the estimated treatment effect, but the absence of statistical significance suggests that there isn’t strong evidence to reject the null hypothesis that the treatment effect is zero.